options(scipen=999)

library(tidyverse)
library(caret)
library(knitr) 
library(pscl)
library(plotROC)
library(pROC)
library(sf)
library(viridis)
library(ggplot2)
library(grid)
library(gridExtra)



palette5 <- c("#3B668C","#8BBBD9","#485923","#728C58","#F2C791")
palette4 <- c("#3B668C","#8BBBD9","#485923","#F2C791")
palette2 <- c("#3B668C","#F2BF27")



mapTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = 14,colour = "black"),
    plot.subtitle=element_text(face="italic"),
    plot.caption=element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=1)
  )
}

plotTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = 14,colour = "black"),
    plot.subtitle = element_text(face="italic"),
    plot.caption = element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),
    panel.grid.major = element_line("grey80", size = 0.1),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=2),
    strip.background = element_rect(fill = "grey80", color = "white"),
    strip.text = element_text(size=12),
    axis.title = element_text(size=12),
    axis.text = element_text(size=10),
    plot.background = element_blank(),
    legend.background = element_blank(),
    legend.title = element_text(colour = "black", face = "italic"),
    legend.text = element_text(colour = "black", face = "italic"),
    strip.text.x = element_text(size = 14)
  )
}

1.Introduction

1.1 Motivation

Last year, 2018, was the deadliest and most destructive wildfire season ever recorded in California, with total of 8,527 fires and 766,439 hectare forest benn burned. From mid-July 2018 to August 2018, a series of large wildfires broke out in California, mainly in northern California, including the destructive Carr Fire and Mendocino Complex Fire. On August 4, 2018, due to a fire in Northern California, Northern California declared a national disaster.

As of February 17, 2019, the new wildfires include the Woolsey and Camp fires, killing at least 85 people and leaving 2 missing.The fire damage by the Firestone fire alone was more than 3.5 billion (2018 USD), including 1.792 billion in fire fighting costs.

For this project, we seek to predict where wildfires are likly to erupt based on characteristics of each site, the weather condition and historical wildfire records and to what degree it is likely to occur.Thereby providing policymakers and planner with the means to be more proactive in their pre-interventions and their fire fighting operations.

The vedio url is https://www.youtube.com/watch?v=qDsn-VwN9a0&t=152s

1.2 Methodology

fire_unit <- st_read('./CA_fire/fishnet_15.shp')
## Reading layer `fishnet_15' from data source `C:\Users\M S I\Desktop\final\CA_fire\fishnet_15.shp' using driver `ESRI Shapefile'
## Simple feature collection with 4312 features and 8 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -128709.2 ymin: 3599667 xmax: 764189.7 ymax: 4674834
## epsg (SRID):    32611
## proj4string:    +proj=utm +zone=11 +datum=WGS84 +units=m +no_defs
fire_unit$fire_yes <- as.factor(fire_unit$fire_yes)
fire_unit$Unique_id <- as.factor(fire_unit$Unique_id)

fires <- st_read('./CA_fire/fires.shp')%>%
  st_transform(st_crs(fire_unit))
## Reading layer `fires' from data source `C:\Users\M S I\Desktop\final\CA_fire\fires.shp' using driver `ESRI Shapefile'
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, : GDAL
## Message 1: organizePolygons() received an unexpected geometry. Either a polygon
## with interior rings, or a polygon with less than 4 points, or a non-Polygon
## geometry. Return arguments as a collection.
## Warning in CPL_read_ogr(dsn, layer, query, as.character(options), quiet, :
## GDAL Message 1: Geometry of polygon of fid 19666 cannot be translated to Simple
## Geometry. All polygons will be contained in a multipolygon.
## Simple feature collection with 20481 features and 19 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -373237.5 ymin: -604727.6 xmax: 519987.8 ymax: 518283.7
## epsg (SRID):    NA
## proj4string:    +proj=aea +lat_1=34 +lat_2=40.5 +lat_0=0 +lon_0=-120 +x_0=0 +y_0=-4000000 +datum=NAD83 +units=m +no_defs

CA <- st_read('./CA_fire/CA.shp')%>%
  st_transform(st_crs(fire_unit))
## Reading layer `CA' from data source `C:\Users\M S I\Desktop\final\CA_fire\CA.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 1 field
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -128709.2 ymin: 3599667 xmax: 764189.7 ymax: 4674834
## epsg (SRID):    32611
## proj4string:    +proj=utm +zone=11 +datum=WGS84 +units=m +no_defs
fire <- st_read("./CA_fire/fishnet_all.shp")%>%
  st_set_geometry(NULL)
## Reading layer `fishnet_all' from data source `C:\Users\M S I\Desktop\final\CA_fire\fishnet_all.shp' using driver `ESRI Shapefile'
## Simple feature collection with 4312 features and 34 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -128709.2 ymin: 3599667 xmax: 764189.7 ymax: 4674834
## epsg (SRID):    32611
## proj4string:    +proj=utm +zone=11 +datum=WGS84 +units=m +no_defs
fire$fire_yes <- as.factor(fire$fire_yes)
fire$Unique_ID <- as.numeric(1:4312)

columnName_lst <- list( "Fire_Occur", "Wind_Level","Precipitation","Distance_to_Lakes","Distance_to_Roads",
                       "Distance_to_Urban", "Avg_Groudwater_level", "Groudwater_Change",
                       "Vegetation_type", 
                       "Habitats","Solar_Annual","Solar_Spring","Solar_Summer",
                       "Solar_Fall","Solar_Winter",
                       "Avg_Annual_Temperature", "Fuel_Hazard_Rank","Climate_Exposure", 
                       "Climate_Sensitivity","Elevation","Slope","Temperature_Change",
                       "Population_Density","Fire_Risk","Fire_Count","Aspect","Lasting_Days",
                       "Percent_BurnedArea","County","Vegetation_number","Habitat_number",
                       "Aspect_number","FishnetID")

for (i in 1:length(columnName_lst)) {
  columnName = columnName_lst[[i]]
  names(fire)[i]<-paste(columnName)
}
fire_final <- 
  fire %>%
  mutate(WildFire = as.factor(ifelse(Fire_Occur == 1,"FireOccur","NoFire")))%>%
  select(Fire_Occur,Precipitation,Distance_to_Lakes,Distance_to_Roads,Distance_to_Urban,
         Avg_Groudwater_level,Groudwater_Change,Vegetation_number,Habitat_number,Solar_Annual,Solar_Spring,Solar_Summer,Solar_Fall,Solar_Winter,Avg_Annual_Temperature,Fuel_Hazard_Rank,Climate_Exposure,Climate_Sensitivity,Elevation,Slope,Temperature_Change,Population_Density,Fire_Risk,County,WildFire,
FishnetID) 

Our analysis begins with fire perimeters data, downloaded from Calfire contains fire records in the past 120 years in California. The dataset contains following elements:

YEAR_ - Year when the wildfire occurred

FIRE_NAME - Name of the fire

ALARM_DATE - Date when wildfire occurred

CONT_DATE - Date when wildfire put out

GIS_ACRES - Burned area by acres

DAYS - We transformed in R use lubridate to calculate the lasting days of the wildfire

The first step of the prediction is to aggregate the fire data to state of California. We do so by creating fishnets of 10km*10km size that cover the whole state and clip by CA boundary and erase the waterbodies to get 4312 fishnets.

Then we spatial join the data to grid cells and based on the joint count, we divide the 4312 fishnets into two categories: 1 means there were fires occurred in the past 120 years and 0 indicates that no wildfires ever erupted.

ggplot() +
  geom_sf(data=fire_unit, aes(fill = fire_yes)) +
  labs(title = "Occurrence of Wildfires in California",
       subtitle = 'Categorical Outcome: 1:Yes or 0:No') +
  scale_fill_manual(values = palette2) +
  mapTheme()

2. Data

2.1 Data Collection

For our analysis, all the data we gathered are from open sources like Calfire, USGS, DataBasin and Esri Database. Our goal is to provide linyeuxejia and planners with insight regarding the environment and weather factors which might indicate wildfire eruption. As a result, we had 5 categories in the dataset we created:

Topography - Elevation, Slope, Aspect;

Climate - Precipitation, Wind Level, Annual Solar Resources, Climate Exposure, Climate Sensitivity,Average Temperature in summer, Temperature Change in past 20 years;

Landcover - Vegetation Type, Habitats, Fuel Hazard Rank, Fire Risks, Groundwater Level, Change of Groundwater change in past 20 years;

Socioeconomic - Population Density, Median Household Income;

Infrastructure - Distance to Lakes,Distance to main roads,Distance to Railways,Distance to Urban Area;

Table 2: Data Dictionary

library(kableExtra)


summary <- fire %>%
  as.data.frame()%>%
  dplyr::select("Wind_Level","Precipitation","Fire_Risk","Solar_Spring","Solar_Winter","Solar_Annual","Elevation","Solar_Fall","Avg_Annual_Temperature","Climate_Exposure","Groudwater_Change","Solar_Summer","Population_Density","Slope","Habitat_number","Temperature_Change","Fuel_Hazard_Rank","Climate_Sensitivity","Vegetation_number","Distance_to_Roads","Distance_to_Lakes","Aspect_number")%>%
  dplyr::select_if(is.numeric)

summary_df <- data.frame(matrix(ncol = 4, nrow = 1))
names(summary_df) <-c("Variable","Min","Median","Max")

for (i in names(summary)){
  blah<- c(i, round(summary(summary[[i]]),1))
  summary_df <- rbind(summary_df,blah)
}

summary_df2 <- summary_df %>% filter(!is.na(Variable)) %>%
  mutate(Description = case_when(
                                 Variable == "Precipitation" ~ "Precipitation",
                                 Variable == "Fire_Risk" ~ "Fire Risk)",
                                 Variable == "Solar_Spring" ~ "Solar Springs",
                                 Variable == "Solar_Winter" ~ "Solar Winter",
                                 Variable == "Solar_Annual" ~ "Solar Annual",
                                 Variable == "Solar_Fall" ~ "Solar Fall",
                                  Variable == "Elevation" ~ "Elevation",
                                 Variable == "Avg_Annual_Temperature" ~ "Avg Annual Temperature",
                                 Variable == "Climate_Exposure" ~ "Climate Exposure",
                                 Variable == "Groudwater_Change" ~ "Groudwater Changee",
                                 Variable == "Solar_Summer" ~ "Solar Summer",
                                 Variable == "Population_Density" ~ "Population Density",
                                 Variable == "Slope" ~ "Slope",
                                 Variable == "Habitat_number" ~ "Habitat number",
                                 Variable == "Temperature_Change" ~ "Temperature Change",
                                 Variable == "Fuel_Hazard_Rank" ~ "Fuel Hazard Rank",
                                 Variable == "Climate_Sensitivity" ~ "Climate Sensitivity",
                                 Variable == "Vegetation_number" ~ "Vegetation number",
                                 Variable == "Distance_to_Roads" ~ "Distance to Roads",
                                 Variable == "Distance_to_Lakes" ~ "Distance to Lakes",
                                Variable == "Wind_Level" ~ "Wind Level",
                                Variable == "Aspect_number" ~ "Aspect number"),
         Type = "Continuous",
         Category = case_when(Variable == "Fire_Occur_number" ~ "Outcome Variable",
                                 Variable == "Precipitation" ~ "Climate Characteristic",
                                 Variable == "Fire_Risk" ~ "Spatial Characteristic",
                                 Variable == "Solar_Spring" ~ "Climate Characteristic",
                                 Variable == "Solar_Winter" ~ "Climate Characteristic",
                                 Variable == "Solar_Annual" ~ "Climate Characteristic",
                                 Variable == "Solar_Fall" ~ "Climate Characteristic",
                              Variable == "Elevation" ~ "Spatial Characteristic",
                                 Variable == "Avg_Annual_Temperature" ~ "Climate Characteristic",
                                 Variable == "Climate_Exposure" ~ "Climate Exposure",
                                 Variable == "Groudwater_Change" ~ "Spatial Characteristic",
                                 Variable == "Solar_Summer" ~ "Climate Characteristic",
                                 Variable == "Population_Density" ~ "Demographic Characteristic",
                                 Variable == "Slope" ~ "Spatial Characteristic",
                                 Variable == "Habitat_number" ~ "Demographic Characteristic",
                                 Variable == "Temperature_Change" ~ "Climate Characteristic",
                                 Variable == "Fuel_Hazard_Rank" ~ "Spatial Characteristic",
                                 Variable == "Climate_Sensitivity" ~ "Climate Characteristic",
                                 Variable == "Vegetation_number" ~ "Spatial Characteristicr",
                                 Variable == "Distance_to_Roads" ~ "Infrastructure Characteristic",
                                 Variable == "Distance_to_Lakes" ~ "Infrastructure Characteristic",
                                Variable == "Wind_Level" ~ "Climate Characteristic",
                                Variable == "Aspect_number" ~ "Spatial Characteristic")) %>%
  dplyr::select(Variable, Type, Category, Description, everything()) %>%
  arrange(Category)

table_2<-kable(summary_df2) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)%>%
  scroll_box(width = "100%", height = "700px")

table_2
Variable Type Category Description Min Median Max
Wind_Level Continuous Climate Characteristic Wind Level 1 1 1
Precipitation Continuous Climate Characteristic Precipitation 2.5 7.5 17.5
Solar_Spring Continuous Climate Characteristic Solar Springs 4.8 6.1 6.5
Solar_Winter Continuous Climate Characteristic Solar Winter 2.9 3.7 4.2
Solar_Annual Continuous Climate Characteristic Solar Annual 4.4 5.7 6
Solar_Fall Continuous Climate Characteristic Solar Fall 4.2 5.8 6.1
Avg_Annual_Temperature Continuous Climate Characteristic Avg Annual Temperature -3.8 11.5 15.3
Solar_Summer Continuous Climate Characteristic Solar Summer 5.2 7.1 7.3
Temperature_Change Continuous Climate Characteristic Temperature Change -1 0.4 0.6
Climate_Sensitivity Continuous Climate Characteristic Climate Sensitivity -1 -0.6 -0.2
Climate_Exposure Continuous Climate Exposure Climate Exposure -0.2 0.2 0.2
Population_Density Continuous Demographic Characteristic Population Density 0 390.7 513.5
Habitat_number Continuous Demographic Characteristic Habitat number 1 37 83
Distance_to_Roads Continuous Infrastructure Characteristic Distance to Roads 0 1627.8 4748.9
Distance_to_Lakes Continuous Infrastructure Characteristic Distance to Lakes 0 13781.5 24459.5
Fire_Risk Continuous Spatial Characteristic Fire Risk) -2 1 1
Elevation Continuous Spatial Characteristic Elevation -82 217 683
Groudwater_Change Continuous Spatial Characteristic Groudwater Changee -0.4 0 0.1
Slope Continuous Spatial Characteristic Slope 0 0.6 1.8
Fuel_Hazard_Rank Continuous Spatial Characteristic Fuel Hazard Rank -1 1 1
Aspect_number Continuous Spatial Characteristic Aspect number 1 2 5
Vegetation_number Continuous Spatial Characteristicr Vegetation number 1 43 89

2.2 Data Wrangling

Most part of data wrangling was done in ArcGIS. We imported all data we gathered into personal geodatabase and use model builder to project them into projected coordinate system: UTM Zone 11N.

Topography: We downloaded the 30m*30m DEM file of California and processed in ArcGIS to get slope and aspect data. To require the average value of elevation, aspect and slope of each fishnet, zonal statistic tool was used. And the aspect value was reclassified into 9 categories of aspect.

Climate: All climate data were vectors and to assign their values to fishnets, Feature to points tool was used to get center points of fishnets and spatial join the climate value to these points.

Landcover: Similar to previous steps, zonal statistics and spatial join was used to get the attributes or values of each fishnet.

Socioeconomic: Wen wrangled population and median household data of all counties in the state and we created IDW (Inverse Distance Weighted) interpolation surface and use zonal statiswetics function to obtain average median household income and population density by dividing total population by area of each fishnets.

Infrastructure: We use Euclidean Distance to create distance raster files of lakes, railways, major roads and urban regions and use zonal statistics to calculate the mean distance of each fishnet to these 4 variables.

3. Exploratory Analysis

3.1 Wildfire distribution

In this section, we first look at the distribution of wildfires in each county, namely, the count of wildfires in the past 20 years.This graph gives us a whole picture of the distribution of our dependent variable. Top 5 counties, beside Siskiyou, all other four locate in Southern California. San bernardina has the most wildfires with nearly 600 and the following three couties are Inyo, Kern and Riverside, but all of them have less than half of filre count of San Bernardino.

fire_final %>%
  group_by(County) %>%
  summarize(count = n()) %>%
  ggplot(aes(reorder(County, -count), count)) +
    geom_bar(stat = "identity", colour = "white", fill="#3B668C") +
    labs(title = "Count of wildfire of each county, CA",
         x="County", y="Count") +
    plotTheme() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

3.2 Temporal Analysis

Now, we aggregate the months into four seasons, and explore the wildfire characteristics of each season. Our exploration is based on that wildfire is highly influenced by the natural weather. Some typical weather, like the long-lasting drought and high temperature, have a boost effect on the wildfire eruption:

Dec - Feb: Winter;

Mar - May: Spring;

Jun - Aug: Summer;

Sept - Nov: Fall.

We examined three characteristics of wildfires: wildfire count, lasting days of fires and burned area(acres). From figure below, we observe that summer and fall have the most serious wildfire conditions in the year. Wildfires erupt in summer and fall last 12 days and with 4500 acres of burned area on average, but summer has the most fire count with nearly 250 which is 2.5 times of fall and 5 tiems of winter.

library(lubridate)
fire_18 <- 
  fires %>%
  st_set_geometry(NULL) %>%
  mutate(Fire_Counter = 1) %>%
  filter(YEAR_ == 2018)%>%
  mutate(month = month(ALARM_DATE))%>%
  mutate(month.cat = case_when(
                  month >= 3 & month<= 5  ~ "Spring",
                  month >= 6 & month <= 8  ~ "Summer",
                  month >= 9 & month <= 11  ~ "Fall",
                  month >= 12 | month <= 2  ~ "Winter"))%>%
  select(Fire_Counter,month.cat,Days,GIS_ACRES)%>%
  na.omit()%>%
  group_by(month.cat)%>%
  summarize(Fire_Count = sum(Fire_Counter),
            Lasting_days = mean(Days),
            Burned_area = mean(GIS_ACRES))%>%
  select(month.cat,Fire_Count,Lasting_days,Burned_area)

  
fire_18 %>%
    dplyr::select(month.cat,Fire_Count,Lasting_days,Burned_area) %>%
    gather(Variable, value, -month.cat) %>%
      ggplot(aes(Variable, value, fill = month.cat, order = month.cat)) +   
        geom_bar(position = "dodge", stat="identity") +
        facet_wrap(~Variable, scales="free") +
        scale_fill_manual(values = palette4) +
        labs(x="Seasons", y="Value",
             title = "Wildfire Seansonality",
             subtitle = "Three category features") +
        theme(axis.text.x = element_text(angle = 0, hjust = 1))

3.3 Spatial Analysis

The map delivers three information:

Wildfires are obviously spatial clustered on mountaineous region around California central valley and in Sounthern California. Vegetation and land cover types has to do with wildfire eruption like Desert Dry Wash Woodland areas have less fires than juniper-covered areas. Fires in Northern California are more dispersed than in Southern part. This indicates us when doing the prediction, we should take the spatial difference into consideration.

ggplot() + 
  geom_sf(data = CA, colour = "white", fill = "grey") +
  geom_sf(data = fires, colour='#8C030E', fill = '#8C030E') +
  labs(title= "Wildfires, California - 1998 to 2018 ") +
  mapTheme()

Figure below shows the each grid cell’s characteristics of wildfire: count of fires in past 20 years, fire lasting days and aggregate bunred area(0 - 100 acres). It validates what we discussed for the previous figure:

Southern California has a dense wildfire eruption, especially in Santa Barbara and Los Angeles County; Fires of long-lasting days and more burned area occur at remote national parks or mountainous regions.Though most fishnets do not have conspicuous fire patterns, the cumulative burned area deserves our concern. More importantly, in past 20 years, many fishnets have been burned 100% which need us to pay more attention to these areas.

vars_net.long <- 
  fire_unit %>%
  dplyr::select(fire_count:pct_fire)%>%
  gather(Variable, value, -geometry)

vars <- unique(vars_net.long$Variable)
mapList <- list()

for(i in vars){
  mapList[[i]] <- 
    ggplot() +
      geom_sf(data = filter(vars_net.long, Variable == i), aes(fill=value), colour="NA") +
      scale_fill_gradient(low="#3b668c",high="#F2bf27")+
      labs(title=i) +
      mapTheme()}

do.call(grid.arrange,c(mapList, ncol =3, top = "Wildfire Characteristics across Fishnet"))

3.4 Feature Associations With wildfire

Our hypothesis is that wildfire can be predicted conditional on the multiple informations within each fishnet. For the predict outcome is binaral: fire occur or no fire.Scatterplot is not the best way to test a correlation between the dependent variable and continuous features like elevation. The below bar plots shows the mean for all continuous features grouped by fire or No_fire. The interpretation is thatwildfire has more to do with topographical features like slope, elevation and precipitation and climate sensitivity.

fire %>%
  dplyr::select(Fire_Occur,Precipitation,Distance_to_Lakes,Distance_to_Roads,Distance_to_Urban,
                Avg_Groudwater_level,Groudwater_Change,Solar_Annual,Solar_Spring,Solar_Summer,Solar_Fall,
                Solar_Winter,Avg_Annual_Temperature,Climate_Exposure,Climate_Sensitivity,Elevation,Slope,
                Temperature_Change,Population_Density) %>%
  gather(Variable, value, -Fire_Occur) %>%
    ggplot(aes(Fire_Occur, value, fill=Fire_Occur)) + 
      geom_bar(position = "dodge", stat = "summary", fun.y = "mean") + 
      facet_wrap(~Variable, scales = "free") +
      scale_fill_manual(values = palette2) +
      labs(x="Fire_Occur", y="Value", 
           title = "Feature associations with the likelihood of Fire_Occur",
           subtitle = "(continous outcomes)") +
      theme(legend.position = "none")

Not only is wildfire categorical, most of the features are as well. The plots below illustrate whether differences in aspect, fire risk level, fuel harzard rank and wind level.The interpretation is that more fire occur at southwest and west aspect, fuel hazard ranked 2 and wind level art the lowest.

fire %>%
    dplyr::select(Fire_Occur, Wind_Level,Fuel_Hazard_Rank,Fire_Risk,Aspect) %>%
    gather(Variable, value, -Fire_Occur) %>%
    count(Variable, value, Fire_Occur) %>%
      ggplot(., aes(value, n, fill = Fire_Occur)) +   
        geom_bar(position = "dodge", stat="identity") +
        facet_wrap(~Variable, scales="free") +
        scale_fill_manual(values = palette2) +
        labs(x="Fire_Occur", y="Value",
             title = "Feature associations with the likelihood of fire",
             subtitle = "Multi category features") +
       plotTheme()
## Warning: attributes are not identical across measure variables;
## they will be dropped

Then the below two barplots shows the associattion of main vegetation and land cover types with likelihood of wildfire.In fishnets without fire occurence, most have their main vegetation type of Mojave Creosote Bush Scrub or simply speaking, a desert scrub. As for fishnets with fire occurence, the top 1 vegetation type is Non-native Grassland and follows mixed coniferous forest.

fire %>%
  group_by(Fire_Occur,Vegetation_type) %>%
  summarize(count = n()) %>%
  arrange(-count)%>%
  slice(1:10)%>%
  ggplot(.,aes(reorder(Vegetation_type,count,FUN = max),count,fill = Fire_Occur))+
  geom_bar(position = "dodge", stat="identity") +
        facet_wrap(~Fire_Occur, scales="free") +
        scale_fill_manual(values = palette2) +
        labs(x="Fire_Occur", y="Value",
             title = "Feature associations with the likelihood of fire",
             subtitle = "Top 10 Vegetation Types") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = "none")

As for results of land cover type, it consents to results of vegetation which indicates wildfires are clustered at regions covered by grassland, coniferous forest and blue oak woods.

fire %>%
  group_by(Fire_Occur,Habitats) %>%
  summarize(count = n()) %>%
  arrange(-count)%>%
  slice(1:10)%>%
  ggplot(.,aes(reorder(Habitats,count,FUN = max),count,fill = Fire_Occur))+
  geom_bar(position = "dodge", stat="identity") +
        facet_wrap(~Fire_Occur, scales="free") +
        scale_fill_manual(values = palette2) +
        labs(x="Fire_Occur", y="Value",
             title = "Feature associations with the likelihood of fire",
             subtitle = "Top 10 Land Cover Types") +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = "none")

4. Feature Summary and Engineering

4.1 Feature Selection

Feature selection is crucial in predictive modeling. We choose Boruta which uses random forest algorithm to generate thye importance of each feature.

library(Boruta)
featureDF <- fire %>%
  dplyr::select(1:32,-9,-10,-25,-26,-27,-28,-29,-33) 

set.seed(123)
borutaTrain <- Boruta(Fire_Occur ~., data = featureDF, doTrace = 2)
#print(borutaTrain)
borutaTrainDF <- attStats(borutaTrain)
#get final selection 
finalBoruta <- TentativeRoughFix(borutaTrain)
## Warning in TentativeRoughFix(borutaTrain): There are no Tentative attributes!
## Returning original object.
#print(finalBoruta)
borutaFinalDF <- attStats(finalBoruta)

The result of the Boruta feature selection is shown as below. Blue boxplots correspond to minimal, average and maximum Z score of a shadow attribute.

plot(finalBoruta, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(finalBoruta$ImpHistory),function(i)
  finalBoruta$ImpHistory[is.finite(finalBoruta$ImpHistory[,i]),i])
names(lz) <- colnames(finalBoruta$ImpHistory)
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
     at = 1:ncol(finalBoruta$ImpHistory), cex.axis = 0.7)

The figure below shows the mean importance of each feature.

Topography <- borutaFinalDF[c("Elevation","Slope","Aspect_number"),]
socia <- borutaFinalDF[c("Distance_to_Lakes","Distance_to_Roads","Distance_to_Urban",'Population_Density'),]
climate <- borutaFinalDF[c("Wind_Level","Precipitation","Solar_Annual","Solar_Spring","Solar_Summer","Solar_Fall","Solar_Winter","Avg_Annual_Temperature","Climate_Exposure","Climate_Sensitivity","Temperature_Change"),]
vege <- borutaFinalDF[c("Fuel_Hazard_Rank","Vegetation_number","Habitat_number","Avg_Groudwater_level","Groudwater_Change","Fire_Risk"),]
Topography <- Topography %>% 
  select(meanImp)%>%
  mutate(variable = row.names(Topography)) %>%
  as.data.frame()

socia <- socia %>% 
  select(meanImp)%>%
  mutate(variable = row.names(socia)) %>%
  as.data.frame()

climate <- climate %>% 
  select(meanImp)%>%
  mutate(variable = row.names(climate)) %>%
  as.data.frame()

vege <- vege %>% 
  select(meanImp)%>%
  mutate(variable = row.names(vege)) %>%
  as.data.frame()
palette12 <- c("#8C566A","#537FA6","#F2BF27","#F2A341","#BF7449",
               "#49868C","#A0D9D9","#D9B166","#BF9969","#A62D2D","#D92B04","#A61103")
chart1 <- ggplot(Topography, aes(x=reorder(variable, -meanImp), 
                                       y = meanImp, fill = variable)) + 
  labs(x = "variable", y = "meanImp") +
  geom_bar(stat = "identity") + 
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 0, hjust = 0)) +
  labs(title = "Variable Importance of Topography") + 
  scale_fill_manual(values = palette12)  +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = "none") +
  plotTheme()

chart2 <- ggplot(socia, aes(x=reorder(variable, -meanImp), 
                                       y = meanImp, fill = variable)) + 
  labs(x = "variable", y = "meanImp") +
  geom_bar(stat = "identity") + 
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 0, hjust = 0)) +
  labs(title = "Variable Importance of Socioeconomic") + 
  scale_fill_manual(values = palette12)  +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = "none") +
  plotTheme()

chart3 <- ggplot(climate, aes(x=reorder(variable, -meanImp), 
                                       y = meanImp, fill = variable)) + 
  labs(x = "variable", y = "meanImp") +
  geom_bar(stat = "identity") + 
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 0, hjust = 0)) +
  labs(title = "Variable Importance of Climate") + 
  scale_fill_manual(values = palette12)  +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = "none") +
  plotTheme()

chart4 <- ggplot(vege, aes(x=reorder(variable, -meanImp), 
                                       y = meanImp, fill = variable)) + 
  labs(x = "variable", y = "meanImp") +
  geom_bar(stat = "identity") + 
  theme(legend.position = "none", 
        axis.text.x = element_text(angle = 0, hjust = 0)) +
  labs(title = "Variable Importance of Vegetation") + 
  scale_fill_manual(values = palette12)  +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = "none") +
  plotTheme()

grid.arrange(chart1, chart2, chart3, chart4, ncol = 2, 
             top = "Mean Variable Importance of Boruta Feature Selection")

This table shows the result of Boruta selection with the column “decision”, we can decide wich features to select fro latter prediction: besides aspect and wind_level.

table_boruta<-kable(borutaFinalDF) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)%>%
  scroll_box(width = "100%", height = "500px")

table_boruta
meanImp medianImp minImp maxImp normHits decision
Wind_Level 3.971956 4.067652 2.630047 4.978089 1 Confirmed
Precipitation 36.649457 36.426033 35.176268 39.970735 1 Confirmed
Distance_to_Lakes 19.301713 18.902350 17.667848 21.144870 1 Confirmed
Distance_to_Roads 13.151709 12.848610 11.361752 15.102825 1 Confirmed
Distance_to_Urban 21.710234 21.593847 20.373553 22.846396 1 Confirmed
Avg_Groudwater_level 20.028647 20.016232 18.456828 21.625018 1 Confirmed
Groudwater_Change 22.761249 22.419512 21.178975 24.570583 1 Confirmed
Solar_Annual 22.628730 22.498992 21.608236 24.157207 1 Confirmed
Solar_Spring 26.394168 26.335937 25.027784 27.676183 1 Confirmed
Solar_Summer 21.460636 21.422214 20.652230 22.898686 1 Confirmed
Solar_Fall 24.608209 24.607010 22.928526 26.166870 1 Confirmed
Solar_Winter 23.122074 23.030436 22.371562 24.005084 1 Confirmed
Avg_Annual_Temperature 24.536083 24.598776 23.386544 25.725468 1 Confirmed
Fuel_Hazard_Rank 15.871112 15.890458 14.956789 16.903042 1 Confirmed
Climate_Exposure 23.332001 23.488110 21.318109 25.569437 1 Confirmed
Climate_Sensitivity 16.921231 16.926835 15.636236 18.390014 1 Confirmed
Elevation 33.234178 33.307982 31.840272 34.604660 1 Confirmed
Slope 26.581987 26.629220 24.802448 28.158753 1 Confirmed
Temperature_Change 15.539627 15.322199 14.459083 16.672284 1 Confirmed
Population_Density 24.651651 24.722769 22.946948 25.806152 1 Confirmed
Fire_Risk 32.828346 32.901110 31.671734 34.190022 1 Confirmed
Vegetation_number 14.879138 15.080391 12.957855 16.363037 1 Confirmed
Habitat_number 13.284718 13.307986 12.049727 14.169845 1 Confirmed
Aspect_number 7.388944 7.241782 5.689013 9.263658 1 Confirmed

4.2 Model Building

In this section, we developed two prediction models, one with county and one without county. We want to explore whether there is a difference between wildfires risks in different counties.

Due to the sample size limitation among different counties, in this model we only consider the top five counties of wildfire count: San Bernardino, Inyo, Kern, Riverside and Siskiyou County.

We split the data into two sets: training set is data in the past 17 years and test set is data in the past 3 years. For each test set permutation, we calculate the area under the three advantages (ROC), curve of the fitted indicator (AUC), sensitivity, and specificity.

set.seed(3456)
trainIndex <- createDataPartition(fire_final$Vegetation_number, p = .70,
                                  list = FALSE,
                                  times = 1)
fireTrain <- fire_final[ trainIndex,]
fireTest  <- fire_final[-trainIndex,]
Model_withCounty <- train(WildFire ~ ., data = fireTrain %>%
                                               select(-Fire_Occur,
                                                      -FishnetID),
 family="binomial"(link="logit"), method="glm", 
 metric="ROC",
 trControl = trainControl(method = "cv",number = 40,classProbs = TRUE, summaryFunction=twoClassSummary))

Model_noCounty <- train(WildFire ~ ., data = fireTrain %>%
                                               select(-County,
                                                      -Fire_Occur,
                                                      -FishnetID),
 family="binomial"(link="logit"), method="glm", 
 metric="ROC",
 trControl = trainControl(method = "cv",number = 40,classProbs = TRUE, summaryFunction=twoClassSummary))
testProbs <- 
  rbind(
    data.frame(Outcome = fireTest$Fire_Occur,
               probs = predict(Model_withCounty, fireTest, type="prob")$FireOccur,
               predOutcome  = ifelse(predict(Model_withCounty, fireTest, type="prob")$FireOccur > 0.5 , 1, 0),
               regression = "With County",
               County = fireTest$County),
    data.frame(Outcome = fireTest$Fire_Occur,
               probs = predict(Model_noCounty, fireTest, type="prob")$FireOccur,
               predOutcome  = ifelse(predict(Model_noCounty, fireTest, type="prob")$FireOccur > 0.5 , 1, 0),
               regression = "No County",
               County = fireTest$County))

head(testProbs)
##   Outcome     probs predOutcome  regression    County
## 1       1 0.5325527           1 With County San Diego
## 2       1 0.9882197           1 With County San Diego
## 3       1 0.9939973           1 With County San Diego
## 4       0 0.7566670           1 With County San Diego
## 5       1 0.7708105           1 With County San Diego
## 6       1 0.9930569           1 With County San Diego

4.3 Evaluating Accuracy

To choose the apropriate threshold, we calculate the accuracy under each threshold and choose the one with the highest accuracy ((True Positive + True Negative)/ total outcome), as the plot shows, when threshold is 0.51, we got the highest accuracy of 0.8743.

In this section we build a number of plots and tables that try to compare across the county and no county regressions to see whether the inclusion of county affect the result.

iterateThresholds <- function(data) {
  x = .01
  all_prediction <- data.frame()
  while (x <= 1) {
  
  this_prediction <-
      testProbs %>%
      mutate(predOutcome = ifelse(probs > x, 1, 0)) %>%
      count(predOutcome, Outcome) %>%
      summarize(sum_trueNeg = sum(n[predOutcome==0 & Outcome==0]),
                sum_truePos = sum(n[predOutcome==1 & Outcome==1]),
                sum_falseNeg = sum(n[predOutcome==0 & Outcome==1]),
                sum_falsePos = sum(n[predOutcome==1 & Outcome==0]),
            total=sum(n)) %>%
    mutate(True_Positive = sum_truePos / total,
         True_Negative = sum_trueNeg / total,
         False_Negative = sum_falseNeg / total,
         False_Positive = sum_falsePos / total,
         Accuracy = (sum_truePos + sum_trueNeg) / total, Threshold = x)
  
  all_prediction <- rbind(all_prediction, this_prediction)
  x <- x + .01
  }
return(all_prediction)
}
whichThreshold <- iterateThresholds(testProbs)
whichThreshold <- whichThreshold[6:11]

table_threshold<-kable(whichThreshold) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)%>%
  scroll_box(width = "100%", height = "500px")

table_threshold
True_Positive True_Negative False_Negative False_Positive Accuracy Threshold
0.6647332 0.0000000 0.0003867 0.3348801 0.6647332 0.01
0.6647332 0.0011601 0.0003867 0.3337200 0.6658933 0.02
0.6631864 0.0058005 0.0019335 0.3290797 0.6689869 0.03
0.6627997 0.0092807 0.0023202 0.3255994 0.6720804 0.04
0.6624130 0.0146945 0.0027069 0.3201856 0.6771075 0.05
0.6612529 0.0216551 0.0038670 0.3132251 0.6829080 0.06
0.6604795 0.0301624 0.0046404 0.3047177 0.6906419 0.07
0.6593194 0.0456303 0.0058005 0.2892498 0.7049497 0.08
0.6593194 0.0560712 0.0058005 0.2788090 0.7153906 0.09
0.6585460 0.0703790 0.0065739 0.2645012 0.7289250 0.10
0.6558391 0.0823666 0.0092807 0.2525135 0.7382057 0.11
0.6535189 0.0912606 0.0116009 0.2436195 0.7447796 0.12
0.6519722 0.1017015 0.0131477 0.2331787 0.7536736 0.13
0.6500387 0.1125290 0.0150812 0.2223511 0.7625677 0.14
0.6477185 0.1198763 0.0174014 0.2150039 0.7675947 0.15
0.6473318 0.1291570 0.0177881 0.2057231 0.7764888 0.16
0.6457850 0.1384377 0.0193349 0.1964424 0.7842227 0.17
0.6450116 0.1461717 0.0201083 0.1887084 0.7911833 0.18
0.6411446 0.1569992 0.0239753 0.1778809 0.7981439 0.19
0.6395978 0.1662800 0.0255220 0.1686002 0.8058778 0.20
0.6365043 0.1736272 0.0286156 0.1612529 0.8101315 0.21
0.6349575 0.1802011 0.0301624 0.1546790 0.8151585 0.22
0.6341841 0.1875483 0.0309358 0.1473318 0.8217324 0.23
0.6341841 0.1948956 0.0309358 0.1399845 0.8290797 0.24
0.6322506 0.2006961 0.0328693 0.1341841 0.8329466 0.25
0.6314772 0.2072699 0.0336427 0.1276102 0.8387471 0.26
0.6299304 0.2115236 0.0351895 0.1233565 0.8414540 0.27
0.6283836 0.2157773 0.0367363 0.1191029 0.8441609 0.28
0.6268368 0.2188708 0.0382831 0.1160093 0.8457077 0.29
0.6249033 0.2227378 0.0402166 0.1121423 0.8476411 0.30
0.6229698 0.2266048 0.0421500 0.1082753 0.8495746 0.31
0.6210363 0.2289250 0.0440835 0.1059551 0.8499613 0.32
0.6198763 0.2327920 0.0452436 0.1020882 0.8526682 0.33
0.6171694 0.2389791 0.0479505 0.0959010 0.8561485 0.34
0.6156226 0.2416860 0.0494973 0.0931941 0.8573086 0.35
0.6136891 0.2447796 0.0514308 0.0901005 0.8584687 0.36
0.6125290 0.2478732 0.0525909 0.0870070 0.8604022 0.37
0.6113689 0.2498067 0.0537510 0.0850735 0.8611756 0.38
0.6098221 0.2521268 0.0552978 0.0827533 0.8619490 0.39
0.6078886 0.2540603 0.0572312 0.0808198 0.8619490 0.40
0.6067285 0.2575406 0.0583913 0.0773395 0.8642691 0.41
0.6059551 0.2598608 0.0591647 0.0750193 0.8658159 0.42
0.6044084 0.2629544 0.0607115 0.0719258 0.8673627 0.43
0.6024749 0.2652746 0.0626450 0.0696056 0.8677494 0.44
0.6009281 0.2691415 0.0641918 0.0657386 0.8700696 0.45
0.5993813 0.2718484 0.0657386 0.0630317 0.8712297 0.46
0.5986079 0.2745553 0.0665120 0.0603248 0.8731632 0.47
0.5974478 0.2757154 0.0676721 0.0591647 0.8731632 0.48
0.5951276 0.2784223 0.0699923 0.0564578 0.8735499 0.49
0.5931941 0.2799691 0.0719258 0.0549111 0.8731632 0.50
0.5924207 0.2819026 0.0726991 0.0529776 0.8743233 0.51
0.5893271 0.2834493 0.0757927 0.0514308 0.8727765 0.52
0.5893271 0.2842227 0.0757927 0.0506574 0.8735499 0.53
0.5858469 0.2865429 0.0792730 0.0483372 0.8723898 0.54
0.5835267 0.2880897 0.0815932 0.0467904 0.8716164 0.55
0.5827533 0.2880897 0.0823666 0.0467904 0.8708430 0.56
0.5812065 0.2892498 0.0839134 0.0456303 0.8704563 0.57
0.5804331 0.2892498 0.0846868 0.0456303 0.8696829 0.58
0.5792730 0.2907966 0.0858469 0.0440835 0.8700696 0.59
0.5757927 0.2923434 0.0893271 0.0425367 0.8681361 0.60
0.5738592 0.2927301 0.0912606 0.0421500 0.8665893 0.61
0.5719258 0.2938902 0.0931941 0.0409899 0.8658159 0.62
0.5711524 0.2958237 0.0939675 0.0390565 0.8669760 0.63
0.5692189 0.2962104 0.0959010 0.0386698 0.8654292 0.64
0.5676721 0.2973705 0.0974478 0.0375097 0.8650425 0.65
0.5653519 0.2993039 0.0997680 0.0355762 0.8646558 0.66
0.5630317 0.2996906 0.1020882 0.0351895 0.8627224 0.67
0.5603248 0.3004640 0.1047951 0.0344161 0.8607889 0.68
0.5568445 0.3020108 0.1082753 0.0328693 0.8588554 0.69
0.5533643 0.3027842 0.1117556 0.0320959 0.8561485 0.70
0.5498840 0.3039443 0.1152359 0.0309358 0.8538283 0.71
0.5487239 0.3051044 0.1163960 0.0297757 0.8538283 0.72
0.5456303 0.3058778 0.1194896 0.0290023 0.8515081 0.73
0.5433101 0.3058778 0.1218097 0.0290023 0.8491879 0.74
0.5409899 0.3058778 0.1241299 0.0290023 0.8468677 0.75
0.5390565 0.3070379 0.1260634 0.0278422 0.8460944 0.76
0.5371230 0.3078113 0.1279969 0.0270688 0.8449343 0.77
0.5344161 0.3093581 0.1307038 0.0255220 0.8437742 0.78
0.5297757 0.3097448 0.1353442 0.0251353 0.8395205 0.79
0.5247486 0.3109049 0.1403712 0.0239753 0.8356535 0.80
0.5197216 0.3120650 0.1453983 0.0228152 0.8317865 0.81
0.5166280 0.3124517 0.1484919 0.0224285 0.8290797 0.82
0.5108275 0.3132251 0.1542923 0.0216551 0.8240526 0.83
0.5046404 0.3139985 0.1604795 0.0208817 0.8186388 0.84
0.4984532 0.3147718 0.1666667 0.0201083 0.8132251 0.85
0.4926527 0.3155452 0.1724671 0.0193349 0.8081980 0.86
0.4853055 0.3163186 0.1798144 0.0185615 0.8016241 0.87
0.4767981 0.3170920 0.1883217 0.0177881 0.7938902 0.88
0.4682908 0.3182521 0.1968291 0.0166280 0.7865429 0.89
0.4609435 0.3186388 0.2041763 0.0162413 0.7795824 0.90
0.4489559 0.3197989 0.2161640 0.0150812 0.7687548 0.91
0.4331013 0.3205723 0.2320186 0.0143078 0.7536736 0.92
0.4199536 0.3213457 0.2451663 0.0135344 0.7412993 0.93
0.4037123 0.3228925 0.2614076 0.0119876 0.7266048 0.94
0.3801237 0.3240526 0.2849961 0.0108275 0.7041763 0.95
0.3549884 0.3248260 0.3101315 0.0100541 0.6798144 0.96
0.3194122 0.3275329 0.3457077 0.0073473 0.6469451 0.97
0.2617943 0.3302398 0.4033256 0.0046404 0.5920340 0.98
0.1647332 0.3321732 0.5003867 0.0027069 0.4969064 0.99
whichThreshold %>%
  ggplot(.,aes(Threshold, Accuracy)) +
  geom_point() +    
  labs(title = "Accuracy by confusion matrix type and threshold cut off",
       y="Accuracy") +
    geom_vline(aes(xintercept = 0.51), colour = "#3b668c", linetype = 3, size = 1.5) +
  guides(colour=guide_legend(title="Confusion Matrix"))

Figure below shows the distribution of predicted probabilities for each observed outcome, Fire and No_Fire, recoded as 1 and 0, respectively.

The ??hump?? of predicted probabilities for Fire cluster around 1 on the x-axis, while the predicted probabilities for No_Fire cluster around 0.0. This means our model performs well.

ggplot(testProbs, aes(x = probs, fill = as.factor(Outcome))) + 
  geom_density() +
  facet_grid(Outcome ~ .) +
  scale_fill_manual(values = palette2) +
  labs(x = "Fire_Occur", y = "Density of probabilities",
       title = "Distribution of predicted probabilities by observed outcome") +
  theme(strip.text.x = element_text(size = 18),
        legend.position = "none")

Then We build ROC curves for the no county regression by filtering for predicted probabilities with regression label of No Fire. Then we create a second ROC curve for the county regression. AUC (area under the curve) assesses the trade-off between sensitivity and specificity, with values close to a value (maximum value) better predicting fire and non-fire. Our final model shows a good degree of goodness of fit, an AUC of 0.9179, and a good discrimination of prediction probability.And we can see that there’s very tiny difference between AUC of these two models which indicates wildfires across different counties in CA do not have distinct difference.

roc.noCounty <-
  ggplot(testProbs %>% filter(regression == "No County"), 
         aes(d = as.numeric(Outcome), m = probs)) + 
  geom_roc(n.cuts = 51, labels = FALSE,colour = "#3b668c") + 
  style_roc(theme = theme_grey) +
  geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
  labs(title=paste("Regression without county: AUC =", 
                    testProbs %>% 
                     group_by(regression) %>% 
                     summarize(AUC = auc(Outcome,probs)) %>% 
                     filter(regression == "No County") %>% 
                     pull(2) %>% as.character() %>% substr(.,1,4))) +
  plotTheme()
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.withCounty <-
  ggplot(testProbs %>% filter(regression == "With County"), 
         aes(d = as.numeric(Outcome), m = probs)) + 
  geom_roc(n.cuts = 51, labels = FALSE, colour = "#3b668c") + 
  style_roc(theme = theme_grey) +
  geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
  labs(title=paste("Regression with county: AUC =", 
                    testProbs %>% 
                     group_by(regression) %>% 
                     summarize(AUC = auc(Outcome,probs)) %>% 
                     filter(regression == "With County") %>% 
                     pull(2) %>% as.character() %>% substr(.,1,4))) +
  plotTheme()  
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
grid.arrange(roc.noCounty,roc.withCounty,
  top=textGrob("ROC Curves by model specification", gp=gpar(fontsize=20)))  
## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!

## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!

4.4 Evaluating Generalizability

To evaluate the generalizibility of our model, we compared the results of top 4 counties of fire count.When viewing the ROC curve by top 4 counties, we found that it has very similar curves, which shows that no matter which county is, the performance is comparable.

ggplot(testProbs %>% filter(County == "San Bernardino" | County == "Inyo" | County == "Kern"| County == "Riverside"| County == "Fresno" ), 
       aes(d = as.numeric(Outcome), m = probs, group=County, colour=County)) + 
  geom_roc(n.cuts = 51, labels = FALSE)  +
  scale_color_manual(values = palette5)+
  style_roc(theme = theme_grey) +
  facet_wrap(~regression,ncol=1) +
  geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
  labs(title="ROC Curves by County and model specification") +
  plotTheme()
## Warning in verify_d(data$d): D not labeled 0/1, assuming 1 = 0 and 2 = 1!

The perfect result we expect is there’s no false positive and false negative results and with accuracy of 100%. Since perfect predictions do not exist, we focuse on the differences in the wrong predictions between couties to evaluate the generalizibility.

Fire Risk Table:

library(kableExtra)
fire_risk_table <-
   testProbs %>%
      count(predOutcome, Outcome) %>%
      summarize(True_Negative = sum(n[predOutcome==0 & Outcome==0]),
                True_Positive = sum(n[predOutcome==1 & Outcome==1]),
                False_Negative = sum(n[predOutcome==0 & Outcome==1]),
                False_Positive = sum(n[predOutcome==1 & Outcome==0])) %>%
       gather(Variable, Count)%>%
    bind_cols(data.frame(Description = c(
              "We predicted no fire and we didn't see a fire",
              "We predicted fire and saw a fire",
              "We predicted no fire and fire occured",
              "We predicted fire and fire did not occured")))

kable(fire_risk_table,
       caption = "fire risk Table") %>% 
      kable_styling()
fire risk Table
Variable Count Description
True_Negative 724 We predicted no fire and we didn’t see a fire
True_Positive 1534 We predicted fire and saw a fire
False_Negative 186 We predicted no fire and fire occured
False_Positive 142 We predicted fire and fire did not occured

As show in the table above, false negative represents we predict no fire will erupt in the fishnet but actually fire occurs which represents a missing opportunity to prepare for fire fighting. If these missed opportunities are not equally distributed across the county, the algorithm will generate disastrous outcomes.

The figure and histogram below illustrate the prediction characteristics at a threshold of 0.51 and use the model without county feature.

testProbs %>%
  filter(County == "San Bernardino" | County == "Inyo" | County == "Kern"| County == "Riverside"| County == "Fresno" ) %>%
  filter(regression == "No County") %>%
  group_by(County) %>%
  count(predOutcome, Outcome) %>%
  summarize(sum_trueNeg = sum(n[predOutcome==0 & Outcome==0]),
                sum_truePos = sum(n[predOutcome==1 & Outcome==1]),
                sum_falseNeg = sum(n[predOutcome==0 & Outcome==1]),
                sum_falsePos = sum(n[predOutcome==1 & Outcome==0]),
            total=sum(n)) %>%
  mutate(True_Positive = sum_truePos / total,
         True_Negative = sum_trueNeg / total,
         False_Negative = sum_falseNeg / total,
         False_Positive = sum_falsePos / total,
         Accuracy = (sum_truePos + sum_trueNeg) / total) %>%
  select(County, True_Positive, True_Negative, False_Negative, False_Positive,Accuracy) %>%
  gather(key,value,True_Positive:Accuracy) %>%
    ggplot(.,aes(key,value,fill=County)) +
    geom_bar(aes(fill=County),position="dodge",stat="identity") +
    labs(title="Confusion matrix rates by top 5 County (51% threshold)",
         subtitle = "Use Model Without County",
         x="Outcome",y="Rate") +
  scale_fill_manual(values = palette5)+
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    plotTheme()

In the table, Accuracy means the ability to differentiate the fire occur and no fire correctly; Sensitivity means the percentage of all fishnets with fire occurred and the model correctly predicted to have fire; Specificity means percentage of all fishnets without fire occurred and the model correctly predicted no fire.

Though total accuracy are all above 0.8, sensitivity of Inyo and Riverside County is pretty low while specificity is extremely high which need our additional research on it.

testProbs %>%
  filter(County == "San Bernardino" | County == "Inyo" | County == "Kern"| County == "Riverside"| County == "Fresno" ) %>%
  filter(regression == "No County") %>%
  group_by(County) %>%
  count(predOutcome,Outcome,probs) %>%
  summarize(sum_trueNeg = sum(n[predOutcome==0 & Outcome==0]),
                sum_truePos = sum(n[predOutcome==1 & Outcome==1]),
                sum_falseNeg = sum(n[predOutcome==0 & Outcome==1]),
                sum_falsePos = sum(n[predOutcome==1 & Outcome==0]),
            total=sum(n),
         AUC = auc(Outcome,probs)) %>%
  mutate(True_Positive = sum_truePos / total,
      True_Negative = sum_trueNeg / total,
      False_Negative = sum_falseNeg / total,
      False_Positive = sum_falsePos / total,
      Accuracy = (sum_truePos + sum_trueNeg)/ total,
      Sensitivity = True_Positive / (True_Positive +True_Negative),
      Specificity = True_Negative /(True_Negative +False_Positive)) %>%
  select(County,Accuracy,Sensitivity,Specificity, AUC) %>%
  kable() %>%
  kable_styling("striped", full_width = F) %>%
    row_spec(c(1), bold = F, color = "white", background = "#3b668c") %>%
    row_spec(c(2), bold = F, color = "white", background = "#8bbbd9") %>%
    row_spec(c(3), bold = F, color = "white", background = "#485923") %>%
    row_spec(c(4), bold = F, color = "white", background = "#728c58") %>%
    row_spec(c(5), bold = F, color = "white", background = "#f2c791") 
County Accuracy Sensitivity Specificity AUC
Fresno 0.8297872 0.6410256 0.6363636 0.9490909
Inyo 0.8452381 0.1830986 0.9354839 0.7617302
Kern 0.8000000 0.7115385 0.7500000 0.8388889
Riverside 0.8906250 0.4385965 1.0000000 0.9228516
San Bernardino 0.8562874 0.0769231 0.9924812 0.7671384

4.5 Cross Validation

We use 100-fold cross validation to evaluate the generalizabilty of the model. The concentration of the results shows that our model peforms well in our dataset.

ctrl <- trainControl(method = "cv", number = 100, classProbs=TRUE, summaryFunction=twoClassSummary)

cvFit <- train(WildFire ~ ., data = fire_final %>% 
                                   dplyr::select(
                                   -Fire_Occur,
                                   -FishnetID,
                                   -County), 
               method="glm", family="binomial",
                metric="ROC", trControl = ctrl)
dplyr::select(cvFit$resample, -Resample) %>%
  gather(metric, value) %>%
  left_join(gather(cvFit$results[2:4], metric, mean)) %>%
  ggplot(aes(value)) + 
    geom_histogram(bins=35, fill = "#3b668c") +
    facet_wrap(~metric) +
    scale_x_continuous(limits = c(0, 1)) +
    labs(x="Goodness of Fit", y="Count", title="CV Goodness of Fit Metrics",
         subtitle = "Across-fold mean reprented as dotted lines")
## Warning: Removed 6 rows containing missing values (geom_bar).

5. Prediction Result

Last, we plot our prediction result on OSM map. The map below shows the predict fire risk of each fishnet which can be zoom infor detailed information.As it is shown here, the interface can be a reference for CA policymakers, planners and local residentsto get a picture of where is wildfire-risky and to what extend.


pred_fishnet <- data.frame(Outcome = fire_final$Fire_Occur,
               probs = predict(Model_noCounty, fire_final, type="prob")$FireOccur,
               predOutcome  = ifelse(predict(Model_noCounty, fire_final, type="prob")$FireOccur > 0.5 , 1, 0),
               ID = fire_final$FishnetID)
jieguo <- cbind(fire_final,pred_fishnet)

#write.csv(jieguo, file = "jieguo.csv")
library(leaflet)

predict_net84 <- st_read("./CA_fire/fishnet_all84.shp")
## Reading layer `fishnet_all84' from data source `C:\Users\M S I\Desktop\final\CA_fire\fishnet_all84.shp' using driver `ESRI Shapefile'
## Simple feature collection with 4312 features and 34 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -124.4098 ymin: 32.53429 xmax: -114.1308 ymax: 42.0095
## epsg (SRID):    4326
## proj4string:    +proj=longlat +datum=WGS84 +no_defs
bins <- c(0, 0.24, 0.48, 0.71, 0.9, 1)
pal <- colorBin(
  palette = palette5,
  domain = predict_net84$prediction,
  bins = bins)

m <- leaflet(predict_net84) %>%
  addTiles()%>%
  setView(lng = -114.609, lat = 38.823, zoom = 5)

m %>% addPolygons(
  fillColor = ~pal(prediction),
  weight = 0.5,
  opacity = 0.6,
  color = "white",
  fillOpacity = 0.7)%>% 
  addLegend(pal = pal, values = ~prediction, opacity = 0.7, title = NULL,
  position = "bottomright")

6. Conclusion

Derived from our model, the map shows wildfires are spatially autocorrelated as we observed in our exploratory analysis section. Though we cconsidered all recent 120 years of fire records in California which seems to be a great dataset for prediction, historic wildfires may not have much relations to recent years due to climate change, human activity etc. Maybe consider more recent wildfires from 2010 can result to a different risk map.Anyway, wildfires are very special events for they are of small probability but cause gigantic damages.

We could see that regions with high fire risks are in coastal areas in Southern California and in Northern national parks. This is interesting because high-risk wildfire areas in Southern part locate near major urban areas while in northern part, they locate far from cities. This may be in southern part, cities are limited to basins near coast and the vast inner land are, as analysis above, vast desert; Northeastern area is mountainous region coverd with conifrous forest or oak woods. Still more research need to be done on this.

As generalizibility across counties, overally speaking, our model performs well on top 5 counties with high wildfire frequencies. However, percentage of prediction of true positive in some counties are not ideal like in Inyo county and Riverside. To be conservative, we can not say that our model performs well across all 58 counties in California, it just performs well for top 5. In some counties with very low frequency of wildfire like Orange, Napa, San Francisco etc, our model does not predict well.